tutorials/012 - CSV Crawler.ipynb (691 lines of code) (raw):
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"[](https://github.com/aws/aws-sdk-pandas)\n",
"\n",
"# 12 - CSV Crawler\n",
"\n",
"[awswrangler](https://github.com/aws/aws-sdk-pandas) can extract only the metadata from a Pandas DataFrame and then add it can be added to Glue Catalog as a table."
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"from datetime import datetime\n",
"\n",
"import pandas as pd\n",
"\n",
"import awswrangler as wr"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Enter your bucket name:"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdin",
"output_type": "stream",
"text": [
" ············\n"
]
}
],
"source": [
"import getpass\n",
"\n",
"bucket = getpass.getpass()\n",
"path = f\"s3://{bucket}/csv_crawler/\""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Creating a Pandas DataFrame"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>string</th>\n",
" <th>float</th>\n",
" <th>date</th>\n",
" <th>timestamp</th>\n",
" <th>bool</th>\n",
" <th>par0</th>\n",
" <th>par1</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>foo</td>\n",
" <td>1.0</td>\n",
" <td>2020-01-01</td>\n",
" <td>2020-01-01 00:00:00</td>\n",
" <td>True</td>\n",
" <td>1</td>\n",
" <td>a</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>None</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>NaT</td>\n",
" <td>None</td>\n",
" <td>1</td>\n",
" <td>b</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>boo</td>\n",
" <td>2.0</td>\n",
" <td>2020-01-02</td>\n",
" <td>2020-01-02 00:00:01</td>\n",
" <td>False</td>\n",
" <td>2</td>\n",
" <td>b</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" id string float date timestamp bool par0 par1\n",
"0 1 foo 1.0 2020-01-01 2020-01-01 00:00:00 True 1 a\n",
"1 2 None NaN None NaT None 1 b\n",
"2 3 boo 2.0 2020-01-02 2020-01-02 00:00:01 False 2 b"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ts = lambda x: datetime.strptime(x, \"%Y-%m-%d %H:%M:%S.%f\") # noqa\n",
"dt = lambda x: datetime.strptime(x, \"%Y-%m-%d\").date() # noqa\n",
"\n",
"df = pd.DataFrame(\n",
" {\n",
" \"id\": [1, 2, 3],\n",
" \"string\": [\"foo\", None, \"boo\"],\n",
" \"float\": [1.0, None, 2.0],\n",
" \"date\": [dt(\"2020-01-01\"), None, dt(\"2020-01-02\")],\n",
" \"timestamp\": [ts(\"2020-01-01 00:00:00.0\"), None, ts(\"2020-01-02 00:00:01.0\")],\n",
" \"bool\": [True, None, False],\n",
" \"par0\": [1, 1, 2],\n",
" \"par1\": [\"a\", \"b\", \"b\"],\n",
" }\n",
")\n",
"\n",
"df"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Extracting the metadata"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"columns_types, partitions_types = wr.catalog.extract_athena_types(\n",
" df=df, file_format=\"csv\", index=False, partition_cols=[\"par0\", \"par1\"]\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'id': 'bigint',\n",
" 'string': 'string',\n",
" 'float': 'double',\n",
" 'date': 'date',\n",
" 'timestamp': 'timestamp',\n",
" 'bool': 'boolean'}"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"columns_types"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'par0': 'bigint', 'par1': 'string'}"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"partitions_types"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Creating the table"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"wr.catalog.create_csv_table(\n",
" table=\"csv_crawler\",\n",
" database=\"awswrangler_test\",\n",
" path=path,\n",
" partitions_types=partitions_types,\n",
" columns_types=columns_types,\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Checking"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Column Name</th>\n",
" <th>Type</th>\n",
" <th>Partition</th>\n",
" <th>Comment</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>id</td>\n",
" <td>bigint</td>\n",
" <td>False</td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>string</td>\n",
" <td>string</td>\n",
" <td>False</td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>float</td>\n",
" <td>double</td>\n",
" <td>False</td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>date</td>\n",
" <td>date</td>\n",
" <td>False</td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>timestamp</td>\n",
" <td>timestamp</td>\n",
" <td>False</td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>bool</td>\n",
" <td>boolean</td>\n",
" <td>False</td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>par0</td>\n",
" <td>bigint</td>\n",
" <td>True</td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>par1</td>\n",
" <td>string</td>\n",
" <td>True</td>\n",
" <td></td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Column Name Type Partition Comment\n",
"0 id bigint False \n",
"1 string string False \n",
"2 float double False \n",
"3 date date False \n",
"4 timestamp timestamp False \n",
"5 bool boolean False \n",
"6 par0 bigint True \n",
"7 par1 string True "
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"wr.catalog.table(database=\"awswrangler_test\", table=\"csv_crawler\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## We can still using the extracted metadata to ensure all data types consistence to new data"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>string</th>\n",
" <th>float</th>\n",
" <th>date</th>\n",
" <th>timestamp</th>\n",
" <th>bool</th>\n",
" <th>par0</th>\n",
" <th>par1</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>2020-01-01</td>\n",
" <td>2020-01-02</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>a</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" id string float date timestamp bool par0 par1\n",
"0 1 1 1 2020-01-01 2020-01-02 1 1 a"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = pd.DataFrame(\n",
" {\n",
" \"id\": [1],\n",
" \"string\": [\"1\"],\n",
" \"float\": [1],\n",
" \"date\": [ts(\"2020-01-01 00:00:00.0\")],\n",
" \"timestamp\": [dt(\"2020-01-02\")],\n",
" \"bool\": [1],\n",
" \"par0\": [1],\n",
" \"par1\": [\"a\"],\n",
" }\n",
")\n",
"\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"res = wr.s3.to_csv(\n",
" df=df,\n",
" path=path,\n",
" index=False,\n",
" dataset=True,\n",
" database=\"awswrangler_test\",\n",
" table=\"csv_crawler\",\n",
" partition_cols=[\"par0\", \"par1\"],\n",
" dtype=columns_types,\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## You can also extract the metadata directly from the Catalog if you want"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"dtype = wr.catalog.get_table_types(database=\"awswrangler_test\", table=\"csv_crawler\")"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"res = wr.s3.to_csv(\n",
" df=df,\n",
" path=path,\n",
" index=False,\n",
" dataset=True,\n",
" database=\"awswrangler_test\",\n",
" table=\"csv_crawler\",\n",
" partition_cols=[\"par0\", \"par1\"],\n",
" dtype=dtype,\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Checking out"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>string</th>\n",
" <th>float</th>\n",
" <th>date</th>\n",
" <th>timestamp</th>\n",
" <th>bool</th>\n",
" <th>par0</th>\n",
" <th>par1</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1.0</td>\n",
" <td>None</td>\n",
" <td>2020-01-02</td>\n",
" <td>True</td>\n",
" <td>1</td>\n",
" <td>a</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1.0</td>\n",
" <td>None</td>\n",
" <td>2020-01-02</td>\n",
" <td>True</td>\n",
" <td>1</td>\n",
" <td>a</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" id string float date timestamp bool par0 par1\n",
"0 1 1 1.0 None 2020-01-02 True 1 a\n",
"1 1 1 1.0 None 2020-01-02 True 1 a"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = wr.athena.read_sql_table(database=\"awswrangler_test\", table=\"csv_crawler\")\n",
"\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"id Int64\n",
"string string\n",
"float float64\n",
"date object\n",
"timestamp datetime64[ns]\n",
"bool boolean\n",
"par0 Int64\n",
"par1 string\n",
"dtype: object"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.dtypes"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Cleaning Up S3"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"wr.s3.delete_objects(path)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Cleaning Up the Database"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"wr.catalog.delete_table_if_exists(database=\"awswrangler_test\", table=\"csv_crawler\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.9.14",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.14"
}
},
"nbformat": 4,
"nbformat_minor": 4
}